zinbModelFromW <- function(counts, sim_W){
  mod <- model.matrix(~ sim_W)
  zinb_sim <- zinbFit(counts, ncores = 3, X=mod, which_X_mu=1:3, which_X_pi=1:3, commondispersion=FALSE)
  true_alpha_mu <- zinb_sim@beta_mu[-1,]
  true_alpha_pi <- zinb_sim@beta_pi[-1,]
  true_beta_mu <- zinb_sim@beta_mu[1,,drop=FALSE]
  true_beta_pi <- zinb_sim@beta_pi[1,,drop=FALSE]
  true_gamma_mu <- zinb_sim@gamma_mu
  true_gamma_pi <- zinb_sim@gamma_pi
  true_zeta <- zinb_sim@zeta
  zinbModel(W=sim_W, alpha_mu=true_alpha_mu, alpha_pi=true_alpha_pi,
            beta_mu=true_beta_mu, beta_pi=true_beta_pi,
            zeta = true_zeta, gamma_mu=true_gamma_mu,
            gamma_pi=true_gamma_pi)
}

Simulation plan

simSummary = data.frame(realData = rep('Allen', 3),
                        K = rep(2, 3),
                        nclusters = rep(3, 3),
                        corMuPi = rep('no', 3),
                        dim =  rep('1000x100', 3),
                        zeroFract = c(.25, .5, .75),
                        nreplicates = 10)
kable(simSummary)                    
realData K nclusters corMuPi dim zeroFract nreplicates
Allen 2 3 no 1000x100 0.25 10
Allen 2 3 no 1000x100 0.50 10
Allen 2 3 no 1000x100 0.75 10

To simulate a dataset, steps are as follow

  1. Filter out the genes that do not have at least 5 counts in at least 5 cells.
  2. Sort genes by decreasing variance.
  3. Fit zinb to get W.
  4. Fit multivariate gaussians using mclust package and number of clusters = 3.
  5. Simulate W from multivariate gaussians. It becomes the truth W.
    For each zero fraction zFrac (25, 50, and 75):
  6. Select genes to get zero fraction zFrac. Bins of genes are created according to percentage of zeros. For example, genes with percentage of zeros between <20%, 20%-60%, >60%.
  7. Get true \(\gamma_{\mu}\), \(\gamma_{\pi}\), \(\alpha_{\mu}\), \(\alpha_{\pi}\), \(\beta_{\mu}\), \(\beta_{\pi}\), and \(\zeta\) by fitting with genewise dispersion \[ln(\mu) = (1, trueW) (\beta_{\mu}, \alpha_{\mu}) + (V\gamma_{\mu})^T,\] \[\text{logit}(\pi) = (1, trueW) (\beta_{\pi}, \alpha_{\pi}) + (V\gamma_{\pi})^T.\]
  8. Create zinb model = zinbModel(trueW, \(\hat{\beta_{\mu}}\),\(\hat{\beta_{\pi}}\), \(\hat{\gamma_{\mu}}\), \(\hat{\gamma_{\pi}}\), \(\hat{\alpha_{\mu}}\), \(\hat{\alpha_{\pi}}\), \(\zeta\)).
  9. Simulate counts using zinbSim(model).

Using this procedure, we have the simulated counts and true \(W\), \(\gamma_{\mu}\), \(\gamma_{\pi}\), \(\alpha_{\mu}\), \(\alpha_{\pi}\), \(\beta_{\mu}\), \(\beta_{\pi}\), and \(\zeta\).

ZINB model

Select cells

data("allen")
prefilter <- allen[grep("^ERCC-", rownames(allen), invert = TRUE),
                    which(colData(allen)$Core.Type=="Core")]

filterGenes = apply(assay(prefilter) > 5, 1, sum) >= 5
postfilter <- prefilter[filterGenes, ]

core <- assay(postfilter)
vars <- rowVars(log1p(core))
names(vars) <- rownames(core)
vars <- sort(vars, decreasing = TRUE)
core <- core[names(vars)[1:1000],]

bioIni =  as.factor(colData(postfilter)$driver_1_s)
cl <- as.factor(colData(postfilter)$Primary.Type)
pZero <- rowMeans(log1p(core) == 0)
names(pZero) <- rownames(core)
sum(core == 0)/(nrow(core) * ncol(core))
## [1] 0.339014

The dimensions are

dim(core)
## [1] 1000  285
par(mfrow = c(1,2))
fq <- betweenLaneNormalization(core, which="full")
pca <- prcomp(t(log1p(fq)))
plot(pca$x, col=cols[bioIni], pch=19, main="PCA of log-counts")
#legend("bottomleft", levels(bio), fill=cols)
plot(pca$x, col=cols2[cl], pch=19, main="PCA of log-counts")

Fit data with K = 2, V and X intercepts in both Mu and Pi, commondispersion = FALSE, and no covariate.

print(system.time(zinb <- zinbFit(core, ncores = 3, K = 2,
                                  commondispersion = FALSE)))
##    user  system elapsed 
## 342.893 180.993 207.666
par(mfrow = c(1,1))

# zinb
xlim = c(min(zinb@W[,1]) - .5, max(zinb@W[,1]) + .5) 
ylim = c(min(zinb@W[,2]) - .5, max(zinb@W[,2]) + .5)
plot(zinb@W, col = cols[bioIni], xlim = xlim, ylim = ylim,
     main = 'Fitted W')

# mclust
mm = Mclust(zinb@W, G = 3)
plot(mm, what = 'classification', xlab = 'W1', ylab = 'W2',
     xlim = xlim, ylim = ylim)

# multivar gaussian
sim_W = lapply(mm$classification, function(i){
  mvrnorm(n = 1, mu = mm$parameters$mean[, i],
        Sigma = mm$parameters$variance$sigma[,, i])
})
sim_W = do.call(rbind, sim_W)
bio = mm$classification
plot(sim_W, col = bio,
     main = 'true W (multivar gauss sim)',
     xlab = 'W1', ylab = 'W2', xlim = xlim, ylim = ylim)

Simulate true parameters

50%

core = assay(postfilter)
pZero <- rowMeans(log1p(core) == 0)
names(pZero) = rownames(core)
chosenGenes = c(sample(names(pZero)[pZero > 0.75], 100),
                sample(names(pZero)[pZero<0.75 & pZero>0.6],300),
                sample(names(pZero)[pZero<0.6 & pZero>0.2], 500),
                sample(names(pZero)[pZero<0.2], 100))
core50 = core[chosenGenes, ]
sum(core50 == 0)/(nrow(core50) * ncol(core50))
## [1] 0.4931895
simModel = simModel50 = zinbModelFromW(core50, sim_W)
simData = simData50 = zinbSim(simModel, seed = 8746)
detection_rate <- colSums(core50>0)
coverage <- colSums(core50)
df <- data.frame(gamma_mu = simModel@gamma_mu[1, ],
                 gamma_pi = simModel@gamma_pi[1, ],
                 detection_rate = detection_rate,
                 coverage = coverage)
pairs(df, pch=19, col=bio)

print(cor(df, method="spearman"))
##                  gamma_mu   gamma_pi detection_rate   coverage
## gamma_mu        1.0000000 -0.2833412      0.4483783  0.9416495
## gamma_pi       -0.2833412  1.0000000     -0.9542971 -0.3960176
## detection_rate  0.4483783 -0.9542971      1.0000000  0.5218290
## coverage        0.9416495 -0.3960176      0.5218290  1.0000000
df <- data.frame(alpha_mu_1 = simModel@alpha_mu[1, ],
                 alpha_mu_2 = simModel@alpha_mu[2, ],
                 alpha_pi_1 = simModel@alpha_pi[1, ],
                 alpha_pi_2 = simModel@alpha_pi[2, ])
pairs(df, pch=19, main = 'alpha')

print(cor(df, method="spearman"))
##             alpha_mu_1  alpha_mu_2  alpha_pi_1 alpha_pi_2
## alpha_mu_1  1.00000000  0.00847088 -0.12261899  0.1081034
## alpha_mu_2  0.00847088  1.00000000  0.01868779 -0.1116836
## alpha_pi_1 -0.12261899  0.01868779  1.00000000  0.1517242
## alpha_pi_2  0.10810337 -0.11168357  0.15172423  1.0000000
df <- data.frame(beta_mu = simModel@beta_mu[1, ],
                 beta_pi = simModel@beta_pi[1, ])
pairs(df, pch=19, main = 'Beta')

print(cor(df, method="spearman"))
##            beta_mu    beta_pi
## beta_mu  1.0000000 -0.1067677
## beta_pi -0.1067677  1.0000000

25%

chosenGenes = c(sample(names(pZero)[pZero > 0.4], 110),
                sample(names(pZero)[pZero<0.4 & pZero>0.2], 490),
                sample(names(pZero)[pZero<0.2], 400))

core25 <- core[chosenGenes, ]
sum(core25 == 0)/(nrow(core25) * ncol(core25))
## [1] 0.2494596
simModel = simModel25 = zinbModelFromW(core25, sim_W)
simData = simData25 = zinbSim(simModel, seed = 8746)
detection_rate <- colSums(core25>0)
coverage <- colSums(core25)
df <- data.frame(gamma_mu = simModel@gamma_mu[1, ],
                 gamma_pi = simModel@gamma_pi[1, ],
                 detection_rate = detection_rate,
                 coverage = coverage)
pairs(df, pch=19, col=bio)

print(cor(df, method="spearman"))
##                  gamma_mu   gamma_pi detection_rate   coverage
## gamma_mu        1.0000000 -0.3680141      0.5131362  0.9357462
## gamma_pi       -0.3680141  1.0000000     -0.9523503 -0.3982634
## detection_rate  0.5131362 -0.9523503      1.0000000  0.5071295
## coverage        0.9357462 -0.3982634      0.5071295  1.0000000
df <- data.frame(alpha_mu_1 = simModel@alpha_mu[1, ],
                 alpha_mu_2 = simModel@alpha_mu[2, ],
                 alpha_pi_1 = simModel@alpha_pi[1, ],
                 alpha_pi_2 = simModel@alpha_pi[2, ])
pairs(df, pch=19, main = 'alpha')

print(cor(df, method="spearman"))
##             alpha_mu_1  alpha_mu_2  alpha_pi_1 alpha_pi_2
## alpha_mu_1  1.00000000 -0.05371179 -0.19891845  0.1373080
## alpha_mu_2 -0.05371179  1.00000000  0.04851486 -0.2007947
## alpha_pi_1 -0.19891845  0.04851486  1.00000000  0.1900043
## alpha_pi_2  0.13730799 -0.20079465  0.19000434  1.0000000
df <- data.frame(beta_mu = simModel@beta_mu[1, ],
                 beta_pi = simModel@beta_pi[1, ])
pairs(df, pch=19, main = 'Beta')

print(cor(df, method="spearman"))
##            beta_mu    beta_pi
## beta_mu  1.0000000 -0.2206999
## beta_pi -0.2206999  1.0000000

75%

chosenGenes = c(sample(names(pZero)[pZero > 0.8], 700),
                sample(names(pZero)[pZero < 0.8 & pZero > 0.3], 200),
                sample(names(pZero)[pZero < 0.3], 100))

core75 <- core[chosenGenes, ]
sum(rowSums(core75) == 0)
## [1] 0
core75 = core75[rowSums(core75) != 0, ]
sum(core75 == 0)/(nrow(core75) * ncol(core75))
## [1] 0.7567018
simModel = simModel75 = zinbModelFromW(core75, sim_W)
simData = simData75 = zinbSim(simModel, seed = 8746)
detection_rate <- colSums(core75>0)
coverage <- colSums(core75)
df <- data.frame(gamma_mu = simModel@gamma_mu[1, ],
                 gamma_pi = simModel@gamma_pi[1, ],
                 detection_rate = detection_rate,
                 coverage = coverage)
pairs(df, pch=19, col=bio)

print(cor(df, method="spearman"))
##                  gamma_mu   gamma_pi detection_rate   coverage
## gamma_mu        1.0000000 -0.2657099      0.4290702  0.9109961
## gamma_pi       -0.2657099  1.0000000     -0.9561455 -0.4233761
## detection_rate  0.4290702 -0.9561455      1.0000000  0.5332160
## coverage        0.9109961 -0.4233761      0.5332160  1.0000000
df <- data.frame(alpha_mu_1 = simModel@alpha_mu[1, ],
                 alpha_mu_2 = simModel@alpha_mu[2, ],
                 alpha_pi_1 = simModel@alpha_pi[1, ],
                 alpha_pi_2 = simModel@alpha_pi[2, ])
pairs(df, pch=19, main = 'alpha')

print(cor(df, method="spearman"))
##            alpha_mu_1 alpha_mu_2 alpha_pi_1 alpha_pi_2
## alpha_mu_1  1.0000000  0.1159766 0.12211908 0.14437814
## alpha_mu_2  0.1159766  1.0000000 0.07254640 0.13098009
## alpha_pi_1  0.1221191  0.0725464 1.00000000 0.07101794
## alpha_pi_2  0.1443781  0.1309801 0.07101794 1.00000000
df <- data.frame(beta_mu = simModel@beta_mu[1, ],
                 beta_pi = simModel@beta_pi[1, ])
pairs(df, pch=19, main = 'Beta')

print(cor(df, method="spearman"))
##            beta_mu    beta_pi
## beta_mu  1.0000000 -0.1395131
## beta_pi -0.1395131  1.0000000

ZINB simulation

50% percentage of zeros

core = core50
simData = simData50
simModel = simModel50
sim_detection_rate <- rowMeans(simData$counts>0)
sim_coverage <- rowSums(simData$counts)

par(mfrow=c(1, 2))
plot(rowMeans(getPi(simModel)), sim_detection_rate,
     xlab="Average simulated Pi", ylab="True Detection Rate",
     pch=19, col=bio, ylim = c(0, 1), xlim = c(0,1))
plot(rowMeans(log1p(getMu(simModel))), log1p(sim_coverage), xlim = c(4, 7),
     xlab="Average simulated log Mu", ylab="log Coverage", pch=19,
     col=bio,ylim = c(9,14))

pal =colorRampPalette(c("black","black", "red","yellow"), space="rgb")
mean = apply(log1p(core), 1, function(x) mean(x[x!=0]))
par(mfrow=c(1, 2))
smoothScatter(mean, rowMeans(core == 0),xlim = c(3,8),
              nrpoints=Inf, pch = "", cex = .7, ylim = c(0,1),
              xlab = "Mean Expression", ylab = "Dropout Rate", 
              colramp = pal, main = 'Observed')
smoothScatter(colMeans(log1p(getMu(simModel))), colMeans(getPi(simModel)), nbin = 256,
              nrpoints=Inf, pch="", cex=.7, xlim = c(3,8),
              xlab = "Mean Expression", main = 'Simulated',
              ylab = "Dropout Rate",ylim = c(0,1),
              colramp = pal)

25% percentage of zeros

core = core25
simData = simData25
simModel = simModel25
sim_detection_rate <- rowMeans(simData$counts>0)
sim_coverage <- rowSums(simData$counts)

par(mfrow=c(1, 2))
plot(rowMeans(getPi(simModel)), sim_detection_rate,
     xlab="Average simulated Pi", ylab="True Detection Rate",
     pch=19, col=bio, ylim = c(0, 1), xlim = c(0,1))
plot(rowMeans(log1p(getMu(simModel))), log1p(sim_coverage), xlim = c(4, 7),
     xlab="Average simulated log Mu", ylab="log Coverage", pch=19,
     col=bio,ylim = c(9,14))

pal =colorRampPalette(c("black","black", "red","yellow"), space="rgb")
mean = apply(log1p(core), 1, function(x) mean(x[x!=0]))
par(mfrow=c(1, 2))
smoothScatter(mean, rowMeans(core == 0),xlim = c(3,8),
              nrpoints=Inf, pch = "", cex = .7, ylim = c(0,1),
              xlab = "Mean Expression", ylab = "Dropout Rate", 
              colramp = pal, main = 'Observed')
smoothScatter(colMeans(log1p(getMu(simModel))), colMeans(getPi(simModel)), nbin = 256,
              nrpoints=Inf, pch="", cex=.7, xlim = c(3,8),
              xlab = "Mean Expression", main = 'Simulated',
              ylab = "Dropout Rate",ylim = c(0,1),
              colramp = pal)

75% percentage of zeros

simData = simData75
simModel = simModel75
core = core75
sim_detection_rate <- rowMeans(simData$counts>0)
sim_coverage <- rowSums(simData$counts)

par(mfrow=c(1, 2))
plot(rowMeans(getPi(simModel)), sim_detection_rate,
     xlab = "Average simulated Pi", ylab = "True Detection Rate",
     pch = 19, col = bio, ylim = c(0, 1), xlim = c(0,1))
plot(rowMeans(log1p(getMu(simModel))), log1p(sim_coverage), xlim = c(4, 7),
     xlab="Average simulated log Mu", ylab="log Coverage", pch=19,
     col=bio,ylim = c(9,14))

pal =colorRampPalette(c("black","black", "red","yellow"), space="rgb")
mean = apply(log1p(core), 1, function(x) mean(x[x!=0]))
par(mfrow=c(1, 2))
smoothScatter(mean, rowMeans(core == 0),xlim = c(3,8),
              nrpoints=Inf, pch = "", cex = .7, ylim = c(0,1),
              xlab = "Mean Expression", ylab = "Dropout Rate", 
              colramp = pal, main = 'Observed')
smoothScatter(colMeans(log1p(getMu(simModel))), colMeans(getPi(simModel)), nbin = 256,
              nrpoints=Inf, pch="", cex=.7, xlim = c(3,8),
              xlab = "Mean Expression", main = 'Simulated',
              ylab = "Dropout Rate",ylim = c(0,1),
              colramp = pal)

10 replicates

B = 10
bio = as.vector(bio)
originalCounts = core50
simModel = simModel50
simData = lapply(seq_len(B), function(i) zinbSim(simModel, seed=i))
save(originalCounts, bio, simModel, simData, file = "./datasets/sim_allen5.rda")
zero5 = sapply(simData, function(x) x$zeroFraction)
originalCounts = core25
simModel = simModel25
simData = lapply(seq_len(B), function(i) zinbSim(simModel, seed=i))
save(originalCounts, bio, simModel, simData, file = "./datasets/sim_allen25.rda")
zero25 = sapply(simData, function(x) x$zeroFraction)
originalCounts = core75
simModel = simModel75
simData = lapply(seq_len(B), function(i) zinbSim(simModel, seed=i))
save(originalCounts, bio, simModel, simData,
     file = "./datasets/sim_allen75.rda")
zero75 = sapply(simData, function(x) x$zeroFraction)
zeroFrac = data.frame(zero = c(zero5, zero25, zero75),
                      perc = c(rep(.5, 10), rep(.25, 10), rep(.75, 10)))
zeroFrac$perc = factor(zeroFrac$perc)
zeroFrac = melt(zeroFrac)
ggplot(zeroFrac, aes(x = perc, y = value)) + geom_boxplot() + 
  ylab('zero fraction') + xlab('wanted zero fraction') + ggtitle('one boxplot = 10 replicates')